# Merging the plant data with the load data
library(pacman)
p_load(
  here, fst, data.table, stringr, janitor, lubridate, purrr, 
  collapse
)

merge_plant_load_nerc = function(nerc_adj_in, eload_var = 'excess_load'){
# Loading data ----------------------------------------------------------------
  print('Loading data')
  # BA NERC crosswalk 
  ba_nerc_xwalk = fread(
    file = here('Data/electricity-generation/ba-nerc-xwalk-oge.csv')
  )
  # Folder with OGE data
  oge_path = here('Data/electricity-generation/open-grid-emissions/v0.3.0')
  # List of raw data files 
  oge_plant_files_all = c(
      list.files(here(oge_path,'2019_plant_data_hourly_us_units'), full.names = TRUE),
      list.files(here(oge_path,'2020_plant_data_hourly_us_units'), full.names = TRUE)
  ) 
  # Filtering to just BA's in nerc region
  oge_plant_files_dt = 
    data.table(
      year = str_extract(oge_plant_files_all, "(?<=/)\\d{4}(?=_)"),
      ba_code = str_extract(oge_plant_files_all, "(?<=/)\\w*(?=\\.csv)"),
      file = oge_plant_files_all
    ) |>
    join(
      ba_nerc_xwalk, 
      on = 'ba_code',
      how = 'left'
    )
  # Plant level data
  oge_plant_dt = 
    map_dfr(
      oge_plant_files_dt[nerc_adj == nerc_adj_in]$file,
      \(fp){
        fread(fp)[,.(
          plant_id_eia, 
          datetime_utc, #= ymd_hms(datetime_utc), 
          year = oge_plant_files_dt[file == fp]$year,
          net_generation_mwh,
          fuel_consumed_for_electricity_mmbtu,
          co2e_mass_lb_for_electricity = 
            # Using AR5 100 year "climate-carbon feedback" GWP's
            co2_mass_lb_for_electricity +
            34*ch4_mass_lb_for_electricity +
            298*n2o_mass_lb_for_electricity,
          nox_mass_lb_for_electricity,
          so2_mass_lb_for_electricity
        )]
      }
    )[!is.na(plant_id_eia)] |> 
    setkey(plant_id_eia, datetime_utc)
  # Make sure datetime isnt missing
  if(nrow(oge_plant_dt[is.na(datetime_utc)]) > 0){
    stop(paste0('oge_plant_dt missing datetime_utc for ',nerc_adj_in))
  }
  # Load data 
  colnames_vec_in = c('nerc_adj', 'datetime_utc', eload_var)
  oge_nerc_load_dt = 
    read.fst(
      path = here("Data/electricity-generation/clean-load/nerc-load-dt-oge.fst"),
      as.data.table = TRUE
    )[!(nerc_adj %in% c('AK','HI','ASCC')) 
      & datetime_utc >= fmin(oge_plant_dt$datetime_utc)
      & datetime_utc <= fmax(oge_plant_dt$datetime_utc),
      ..colnames_vec_in
    ] |>
    setnames(old = eload_var, new = 'excess_load') %>%
    .[,excess_load_sq := excess_load^2] |>
    dcast(
      formula = datetime_utc ~ nerc_adj,
      value.var = c('excess_load','excess_load_sq')
    ) |> 
    clean_names() |>
    setkey(datetime_utc) 
  # Plant info table 
  oge_plant_info_dt =
    read.fst(
      path = here("Data/electricity-generation/plant-info-dt-oge.fst"),
      as.data.table = TRUE
    )[!(nerc_adj %in% c('AK','HI','ASCC'))
    ,.(
      plant_id_eia, 
      year,
      ba_code, 
      nerc_adj,
      #shaped_plant_id,
      fuel_category,
      namepcap
    )] |> setkey(plant_id_eia, year)  
# Filling in missing data with zeros ------------------------------------------
  print('cleaning')
  # Min and max hours in the raw data
  plant_open_close_dt = 
    oge_plant_dt[
      net_generation_mwh != 0
      ,.( 
        first_prod_date = min(datetime_utc),
        last_prod_date = max(datetime_utc),
        tot_prod = sum(net_generation_mwh)
      ),
      keyby = .(plant_id_eia)
    ]
  plant_open_close_dt[,.(
    `Percent of plants open throughout` = sum(fifelse(
    first_prod_date >= lubridate::ymd_hms('2019-02-01 00:00:00')
    | last_prod_date <= lubridate::ymd_hms('2019-12-01 00:00:00'),
    0, tot_prod))/sum(tot_prod)
  )]
  # Creating table with every plant and hour
  plant_hour_dt = 
    map_dfr(
      1:nrow(plant_open_close_dt),
      \(i){
        data.table(
          plant_id_eia = plant_open_close_dt$plant_id_eia[i],
          datetime_utc = seq(
            plant_open_close_dt$first_prod_date[i],
            plant_open_close_dt$last_prod_date[i], 
            by = 'hour'
          )
        )
      }
    ) |> setkey(plant_id_eia, datetime_utc)
  # Merging with the raw data
  oge_plant_dt = 
    join(
      plant_hour_dt,
      oge_plant_dt,
      on = c('plant_id_eia','datetime_utc'),
      how = 'left'
    )
  # Setting NAs to zero 
  oge_plant_dt[,':='(
    net_generation_mwh = fifelse(
      is.na(net_generation_mwh), 0, 
      net_generation_mwh
    ),
    fuel_consumed_for_electricity_mmbtu = fifelse(
      is.na(fuel_consumed_for_electricity_mmbtu), 0, 
      fuel_consumed_for_electricity_mmbtu
    ),
    co2e_mass_lb_for_electricity = fifelse(
      is.na(co2e_mass_lb_for_electricity), 0, 
      co2e_mass_lb_for_electricity
    ),
    nox_mass_lb_for_electricity = fifelse(
      is.na(nox_mass_lb_for_electricity), 0, 
      nox_mass_lb_for_electricity
    ),
    so2_mass_lb_for_electricity = fifelse(
      is.na(so2_mass_lb_for_electricity), 0, 
      so2_mass_lb_for_electricity
    ),
    year = fifelse(
      is.na(year), year(datetime_utc), 
      as.numeric(year)
    )
  )]
  # Loading transmission constraint flag 
  transmission_constraint_dt = 
    read.fst(
      path = here("Data/electricity-generation/clean-load/nerc-load-dt-oge.fst"),
      as.data.table = TRUE
    )[nerc_adj == nerc_adj_in 
      & datetime_utc >= fmin(oge_plant_dt$datetime_utc)
      & datetime_utc <= fmax(oge_plant_dt$datetime_utc),
      .(nerc_adj, datetime_utc, above_transmission_constraint = excess_load_high)
    ]  
# Merging together ------------------------------------------------------------
  print('merging')
  oge_elec_gen_dt = 
    join(
      oge_nerc_load_dt,
      oge_plant_dt,
      on = 'datetime_utc', 
      how = 'right'
    )[!is.na(excess_load_cal)] |> 
    join(
      oge_plant_info_dt, 
      on = c('plant_id_eia','year'),
      how = 'left'
    ) |>
    join(
      transmission_constraint_dt, 
      on = c('nerc_adj','datetime_utc'), 
      how = 'left'
    )
  # Setting production to namepcap if it exceeds it, 0 if negative 
  oge_elec_gen_dt[,':='(
    net_generation_mwh_raw = net_generation_mwh,
    above_namepcap = net_generation_mwh > namepcap, 
    negative_generation = net_generation_mwh < 0
  )]
  oge_elec_gen_dt[,
    net_generation_mwh := fcase(
      above_namepcap == TRUE, namepcap, 
      negative_generation == TRUE, 0,
      above_namepcap == FALSE & negative_generation == FALSE,
      net_generation_mwh
  )]  
  # make sure there is no missing data 
  if(nrow(oge_elec_gen_dt[is.na(net_generation_mwh)]) > 0){
    stop('net_generation_mwh is missing for some rows of oge_elec_gen_dt')
  }
  # Net generation by energy source 
  oge_elec_gen_dt[,.(
    tot_ng = sum(net_generation_mwh),
    tot_co2 = sum(co2e_mass_lb_for_electricity),
    tot_so2 = sum(so2_mass_lb_for_electricity),
    tot_nox = sum(nox_mass_lb_for_electricity), 
    xmis = mean(above_transmission_constraint), 
    neg = mean(negative_generation)
  ), keyby = fuel_category]
  
# Saving the results ----------------------------------------------------------
  if(str_detect(eload_var,'loss')){
    path_in = here(paste0(
      "Data/electricity-generation/elec-gen-dt/elec-gen-dt-line-losses",
      tolower(nerc_adj_in),".fst"
    ))
  }else{
    path_in = here(paste0(
      "Data/electricity-generation/elec-gen-dt/elec-gen-dt-",tolower(nerc_adj_in),".fst"
    ))
  }
  write.fst(
    oge_elec_gen_dt,
    path = path_in
  )
  print('done.')
}

# Running each region
merge_plant_load = function(
  nerc_adj_in = c('CAL','MRO','NPCC','RFC','SERC','TRE','WECC')
){
  map(
    nerc_adj_in,
    merge_plant_load_nerc
  )
}
